#This replication file prepares the data for the European analysis.
#1. Load a dataset provided by the University of Groningen with information on income levels.
#2. Calculate the area of each administrative unit based on a shapefile containing European municipalities.
#3. Calculate the relevant demographic variables based on data provided by censusHub.
#4. Merge all 3 datasets (income, area and demographics).
#5. Load the data from the European notary website and perform a spatial merge in order to calculate the number of notaries per census unit.

#Clear working space and set working directory
rm(list=ls())
setwd("C:/PATH_TO_REPLICATION_FILES/Data_Europe")

require(rgdal) #This package reads data from ArcGIS
require(plyr) #This package renames variables with-in a dataset
library(raster) #This package calculates surface area
library(readr) #This package reads csv files
require(maptools)
require(sp)
require(tibble)
require(spatialEco)
require(mgcv) #This package finds unique addresses 
require(sf) #This package performs the spatial merge
require(foreign) #This package is exports data into Stata.

########################
#1. Income information #
########################

#Define the countries in Europe which are used in the analysis.
EU_8 = c("AT","BE","FR","DE","IT","NL","PT","ES")

# Read the income shapefile from the directory ("Income_NUTS3")
Income_NUTS3 <- readOGR(dsn = "Income_NUTS3", layer = "Income_NUTS3")
#The first 2 characters of the NUTS_IDs correspond to each country's code.
#They are stored in a separate variable named Country.
Income_NUTS3$Country<-substr(Income_NUTS3$NUTS_ID,start=1,stop=2)

#The variable NUTS_ID in the dataset is renamed NUTS3.
Income_NUTS3<-rename(Income_NUTS3,c("NUTS_ID"="NUTS3"))
#Select only those countries which are relevant for our analysis.
Income_NUTS3<-Income_NUTS3[is.element(Income_NUTS3$Country,EU_8)==TRUE,]
#Save the data as an RData file.
save(Income_NUTS3,file="Generated_data/Income_NUTS3.RData")

################################
#2. Calculating municipal area #
################################

#Read SHAPEFILE.shp from the current working directory (".")
CensusUnits <- readOGR(dsn = "CENSUS_UNITS_2011_RG/data/", layer = "CENSUS_UNIT_RG_01M_2011")

#Make sure to check that you are using a projection which is in lonlat format.
crs(CensusUnits)
#Calculate surface area
CensusUnits$Area_sqkm <- area(CensusUnits) / 1000000

#The first 2 characters of the CENSUS IDs correspond to each country's code.
#They are stored in a separate variable named Country.
CensusUnits$Country<-substr(CensusUnits$CENSUS_ID,start=1,stop=2)
#Select only those countries which are relevant for our analysis.
CensusUnits<-CensusUnits[is.element(CensusUnits$Country,EU_8)==TRUE,]
CensusUnits$Shape_Area<-NULL
CensusUnits$Shape_Leng<-NULL
#Save the data as an RData file.
save(CensusUnits,file="Generated_data/CensusUnits.RData")

##############################
# 3. Census data (CensusHub) #
##############################
#Create an empty dataset which will be filled with the information from all countries
data_EU<-data.frame(GEO=character(),SEX=character(),AGE=character(),TIME=integer(),VALUE=integer())

#Load the CensusHub data
#Save all the names of the files in the CensusHub county by country dataset
file_names<-list.files(path="CensusHub",pattern=".csv$")

#Country by country analysis
for(i in 1:length(file_names)) { 
  file_name<-paste("CensusHub/",file_names[i],sep="")
  print(file_name)
  data <- read_csv(file_name)
  #Take the first 5 variables
  data<-data[,1:5]
  data$VALUE_original<-data$VALUE
  data$VALUE<-as.integer(data$VALUE)
  data_EU<-rbind(data_EU,data)
  rm(data)
} 

#Drop all observations that are at the country level or at the NUTS3 level
data_EU<-data_EU[nchar(data_EU$GEO)>5,]
#Drop the time variable (as it is 2011 for all observations)
data_EU$TIME<-NULL
#Generate a country identifier
data_EU$Country<-substr(data_EU$GEO,start=1,stop=2)
#There are observations with no population and no data. We drop them.
data_EU<-data_EU[data_EU$GEO!="SEZZZ_ZZ"&data_EU$GEO!="SKZZZ_ZZ"&data_EU$GEO!="ROZZZ_ZZ"&data_EU$GEO!="PTZZZ_ZZ"&data_EU$GEO!="PLZZZ_ZZ"&data_EU$GEO!="NLZZZ_ZZ"&data_EU$GEO!="MTZZZ_MTZZ"&data_EU$GEO!="LVZZZ_ZZ"&data_EU$GEO!="ISZZZ_ZZ"&data_EU$GEO!="HRZZZ_ZZ"&data_EU$GEO!="ESZZZ_ZZ"&data_EU$GEO!="LIZZZ_LIZZ"&data_EU$GEO!="FIZZZ_ZZ"&data_EU$GEO!="CHZZZ_ZZ"&data_EU$GEO!="BEZZZ_ZZ"&data_EU$GEO!="CZZZZ_ZZ"&data_EU$GEO!="ATZZZ_ZZ"&data_EU$GEO!="DKZZZ_ZZ"&data_EU$GEO!="IEZZZ_ZZ"&data_EU$GEO!="EEZZZ_ZZ"&data_EU$GEO!="ITZZZ_ZZ",]

#Create a dataset which has a unique row for each CensusHub Unit
CensusHub<-as.data.frame(table(data_EU$GEO))
#Each CensusHub ID should appear 14 times in the data (as we collect 14 distinct pieces of information)
table(CensusHub$Freq)
CensusHub$Freq<-NULL

#Rename the first variable in CensusHub Census ID
CensusHub<-rename(CensusHub,c("Var1"="CENSUS_ID"))
#Generate a variable with the country code
CensusHub$Country<-substr(CensusHub$CENSUS_ID,start=1,stop=2)

#Assign the values
CensusHub<-merge(CensusHub,data_EU[data_EU$AGE=="TOTAL" & data_EU$SEX=="T",c(which(colnames(data_EU)=="VALUE"),which(colnames(data_EU)=="GEO"))],by.x="CENSUS_ID",by.y="GEO")
CensusHub<-rename(CensusHub,c("VALUE"="Population"))
CensusHub<-merge(CensusHub,data_EU[data_EU$AGE=="TOTAL" & data_EU$SEX=="M",c(which(colnames(data_EU)=="VALUE"),which(colnames(data_EU)=="GEO"))],by.x="CENSUS_ID",by.y="GEO")
CensusHub<-rename(CensusHub,c("VALUE"="Male"))
CensusHub<-merge(CensusHub,data_EU[data_EU$AGE=="Y_LT15" & data_EU$SEX=="T",c(which(colnames(data_EU)=="VALUE"),which(colnames(data_EU)=="GEO"))],by.x="CENSUS_ID",by.y="GEO")
CensusHub<-rename(CensusHub,c("VALUE"="age_below_15"))
CensusHub<-merge(CensusHub,data_EU[data_EU$AGE=="Y15-29" & data_EU$SEX=="T",c(which(colnames(data_EU)=="VALUE"),which(colnames(data_EU)=="GEO"))],by.x="CENSUS_ID",by.y="GEO")
CensusHub<-rename(CensusHub,c("VALUE"="age_15_29"))
CensusHub<-merge(CensusHub,data_EU[data_EU$AGE=="Y30-49" & data_EU$SEX=="T",c(which(colnames(data_EU)=="VALUE"),which(colnames(data_EU)=="GEO"))],by.x="CENSUS_ID",by.y="GEO")
CensusHub<-rename(CensusHub,c("VALUE"="age_30_49"))
CensusHub<-merge(CensusHub,data_EU[data_EU$AGE=="Y50-64" & data_EU$SEX=="T",c(which(colnames(data_EU)=="VALUE"),which(colnames(data_EU)=="GEO"))],by.x="CENSUS_ID",by.y="GEO")
CensusHub<-rename(CensusHub,c("VALUE"="age_50_64"))
CensusHub<-merge(CensusHub,data_EU[data_EU$AGE=="Y65-84" & data_EU$SEX=="T",c(which(colnames(data_EU)=="VALUE"),which(colnames(data_EU)=="GEO"))],by.x="CENSUS_ID",by.y="GEO")
CensusHub<-rename(CensusHub,c("VALUE"="age_65_84"))
CensusHub<-merge(CensusHub,data_EU[data_EU$AGE=="Y_GE85" & data_EU$SEX=="T",c(which(colnames(data_EU)=="VALUE"),which(colnames(data_EU)=="GEO"))],by.x="CENSUS_ID",by.y="GEO")
CensusHub<-rename(CensusHub,c("VALUE"="age_over_85"))
#Select only the countries used in the analysis
CensusHub<-CensusHub[is.element(CensusHub$Country,EU_8)==TRUE,]
table(CensusHub$Country,CensusHub$Population==0)

#Calculate demographic variables
CensusHub$Perc_Male<-CensusHub$Male/CensusHub$Population
CensusHub$Perc_Young<-CensusHub$age_below_15/CensusHub$Population
CensusHub$Perc_Elderly<-(CensusHub$age_65_84+CensusHub$age_over_85)/CensusHub$Population

#Remove unnecessary variables/datasets
rm(data_EU,file_name,file_names,i)
CensusHub<-CensusHub[is.element(CensusHub$CENSUS_ID,c("BGZZZ_ZZ","DEZZZ_ZZ","FRZZZ_ZZ","LTZZZ_ZZ","LUZZZ_ZZ"))==FALSE,]

#Generate a NUTS3 identifier
CensusHub$NUTS3<-substr(CensusHub$CENSUS_ID,start=1,stop=5)

#Merge with information on surface area
CensusUnits<-merge(CensusUnits,CensusHub[,!(names(CensusHub) %in% c("Country"))],by.x="CENSUS_ID",by.y="CENSUS_ID")

#Show which NUTS3 regions are missing in the income data
setdiff(CensusHub$NUTS3,Income_NUTS3$NUTS3)
Income_NUTS3<-Income_NUTS3[,c("NUTS3","INCOME")]

#Merge with income information at the NUTS3 level
CensusUnits<-merge(CensusUnits,Income_NUTS3,by=NUTS3)
rm(CensusHub,Income_NUTS3)
CensusUnits$CENSUS_ID<-as.factor(CensusUnits$CENSUS_ID)
save.image("Generated_data/CensusHub.RData")

crs_census<-crs(CensusUnits)

###################################################
# 4. Notary information from the European website #
###################################################
load("European_website_data.RData")
coordinates(notary_data) <- ~lat+lon
proj4string(notary_data)<-CRS("+proj=longlat +datum=WGS84")
notary_data <- spTransform(notary_data, crs_census) 
CensusUnits$Income_NA<-as.numeric(is.na(CensusUnits$INCOME))
CensusUnits$INCOME[is.na(CensusUnits$INCOME)]<-0
CensusUnits$Perc_Male_NA<-as.numeric(is.na(CensusUnits$Perc_Male))
CensusUnits$Perc_Male[is.na(CensusUnits$Perc_Male)]<-0
CensusUnits$Perc_Elderly_NA<-as.numeric(is.na(CensusUnits$Perc_Elderly))
CensusUnits$Perc_Elderly[is.na(CensusUnits$Perc_Elderly)]<-0
CensusUnits$Perc_Young_NA<-as.numeric(is.na(CensusUnits$Perc_Young))
CensusUnits$Perc_Young[is.na(CensusUnits$Perc_Young)]<-0
CensusUnits$lnpop<-log(CensusUnits$Population)

#Country by country analysis
notary_data$country<-as.factor(notary_data$country)

for (i in c(1:8)) {
  selected_country<-EU_8[i] 
  print(EU_8[i])
  notaries_country<-notary_data[notary_data$country==selected_country,]
  municipalities_country<-CensusUnits[CensusUnits$Country==selected_country,]
  sf_use_s2(FALSE)
  # Convert sp objects to sf
  notaries_country <- sf::st_as_sf(notaries_country)
  municipalities_country <- sf::st_as_sf(municipalities_country)
  
  # use sf functions
  municipalities_country$CENSUS_ID<-factor(municipalities_country$CENSUS_ID)
  if (i==1) {
    #Project the notaries onto the layer with the municipalities of the country
    notaries_with_municipal_ID <- sf::st_intersection(notaries_country, municipalities_country)
  }
  if (i>1){
    #Add the newly merged data to the notary information
    notaries_with_municipal_ID<-rbind(notaries_with_municipal_ID,sf::st_intersection(notaries_country, municipalities_country))
  }
}

rm(notaries_country,municipalities_country,i,selected_country)
#Generate a dataset with the number of notaries in each municipality based on the merging.
number_of_notaries_country<-as.data.frame(table(notaries_with_municipal_ID$CENSUS_ID))
number_of_notaries_country<-rename(number_of_notaries_country,c("Var1"="CENSUS_ID","Freq"="Notaries"))
number_of_notaries_country$Country<-as.factor(substr(number_of_notaries_country$CENSUS_ID,1,2))
#Note: 6 notaries (out of 9087) in France could not be merged, due to address/geocoding issues.

#Replace the original data with the data including CensusUnit characteristics
notary_data<-notaries_with_municipal_ID
rm(notaries_with_municipal_ID)

#Calculate the number of offices by checking for the same address and municipal ID.
notary_data$Office_ID<-as.numeric(as.factor(notary_data$address))
offices<-as.data.frame(notary_data[,c("CENSUS_ID","Office_ID")])
offices<-offices[,c("CENSUS_ID","Office_ID")]
offices<-unique(offices)
offices<-as.data.frame(table(offices$CENSUS_ID))
offices<-rename(offices,c("Freq"="NotOffices","Var1"="CENSUS_ID"))

CensusUnits<-merge(CensusUnits,number_of_notaries_country,by="CENSUS_ID")
CensusUnits<-merge(CensusUnits,offices,by="CENSUS_ID")

rm(offices,crs_census,notary_data,number_of_notaries_country,EU_8)
CensusUnits$Country<-CensusUnits$Country.x
CensusUnits$Country.x<-NULL
CensusUnits$Country.y<-NULL
write.dta(as.data.frame(CensusUnits), "Generated_data/country_data.dta") 